Simple vowel case study

#reading in vowel data
vowel_data <- read.csv("results_processwithpraat.csv") |>
  mutate(Neighborhood = case_when(startsWith(Speaker, "CB") ~ "Cranberry Township",
                                  startsWith(Speaker, "FH") ~ "Forest Hills",
                                  startsWith(Speaker, "HD") ~ "Hill District",
                                  startsWith(Speaker, "LV") ~ "Lawrenceville",
                                  TRUE ~ "Interviewer"))

vowel_data_smaller <- vowel_data %>%
  #dropping unnecessary columns
  select(Speaker, Neighborhood, Text, Target.phonemes, starts_with("F"), MatchId)

vowel_data_outliers_removed <-
  vowel_data_smaller %>%
  group_by(Speaker) %>%
  #finding outliers based on all F1 and F2 measurements
  ##Drop speakers w/ fewer than 75 tokens for reliable outlier-checking
  filter(n() >= 75,
         !find_outliers(F1.time_0.2, F2.time_0.2, F1.time_0.5, F2.time_0.5, F1.time_0.8, F2.time_0.8)) %>%
  ungroup()

#ANAE normalization
vowel_data_normed <-
  vowel_data_outliers_removed %>% 
  group_by(Speaker) %>% 
  joeyr::norm_anae(hz_cols = c(F1.time_0.2, F2.time_0.2, F1.time_0.5, F2.time_0.5, F1.time_0.8, F2.time_0.8), token_id = row.names(.), speaker_id = Speaker) %>%
  ungroup() 

#subsetting to remove incomplete words
#I'm unsure of how /r/ or morphology affects this, so words like "how's" and "cow's" have not been removed
#i'm also assuming vowel stress doesn't matter because it wasn't included in the APLS search
vowel_data_subset <- 
  vowel_data_normed %>% 
  filter(!str_detect(Text, "~$")) %>% 
  #adding in a "vowel" column to help with plotting
  mutate(vowel = "AW",
         traj_length = eucl_dist(F2.time_0.2_anae, F1.time_0.2_anae, F2.time_0.5_anae, F1.time_0.5_anae) +
           eucl_dist(F2.time_0.5_anae, F1.time_0.5_anae, F2.time_0.8_anae, F1.time_0.8_anae))

##Longer for geom_paths
vowel_data_longer <-
  vowel_data_subset |>
  pivot_longer(ends_with("_anae"), names_to=c(".value","timestamp"), names_pattern="(F[12]).time_(0\\.\\d)_anae") |>
  ##Remove unnormalized meas
  select(-contains("time_"))
#plotting the vowel data
ggplot(vowel_data_subset, aes(x = F2.time_0.5_anae, y = F1.time_0.5_anae, color = vowel, label = vowel)) +
  #making slightly transparent points for each individual vowel
  geom_point(alpha = 0.2) +
  #making an ellipse for the vowel measurements overall
  stat_ellipse(level = .67, geom = "polygon", alpha = 0.6, aes(fill = vowel)) +
  #using the means data to add a label to the center
  geom_label(data = vowel_data_subset %>% 
               summarise(across(ends_with("0.5_anae"), mean),
                         across(vowel, unique))) +
  #flipping the scales, like a good vowel plot should
  scale_x_reverse("F1 (ANAE-normalized Hz)") + 
  scale_y_reverse("F2 (ANAE-normalized Hz)") +
  theme_classic() + 
  theme(legend.position="none")

vowel_data_longer |>
  filter(Neighborhood != "Interviewer") |>
  arrange(desc(traj_length)) |>
  ggplot(aes(x = F2, y = F1, color=traj_length)) +
  #making slightly transparent points for each individual vowel
  geom_path(aes(group=MatchId), alpha=0.2, arrow=arrow(length=unit(0.05, "inches"))) +
  facet_wrap(~ Neighborhood) +
  #flipping the scales, like a good vowel plot should
  scale_x_reverse("F1 (ANAE-normalized Hz)") + 
  scale_y_reverse("F2 (ANAE-normalized Hz)") +
  labs(color="Trajectory\nlength (Hz)") +
  theme_classic()

Don’t use: Speaker trajectories

vowel_data_means <- 
  vowel_data_longer |>
  summarise(across(Neighborhood, unique),
            across(c(F1, F2, traj_length), mean),
            .by=c(Speaker, timestamp)) |>
  ##Break Cranberry Township across two lines
  mutate(across(Neighborhood, \(x) if_else(x=="Cranberry Township", "Cranberry\nTownship", x)))
vowel_data_means |>
  filter(Neighborhood != "Interviewer") |>
  ggplot(aes(x = F2, y = F1, color=Neighborhood)) +
  geom_path(aes(group=Speaker), alpha=0.5, arrow=arrow(length=unit(0.125, "inches"))) +
  geom_text(data=vowel_data_means |> 
              filter(Neighborhood != "Interviewer",
                     timestamp==0.2),
            aes(label = Speaker), show.legend=FALSE) +
  #flipping the scales, like a good vowel plot should
  scale_x_reverse("F1 (ANAE-normalized Hz)") + 
  scale_y_reverse("F2 (ANAE-normalized Hz)") +
  theme_classic()

But this plot is misleading. It suggests that FH20 has shorter-trajectory tokens and HD09 longer, but in reality, HD09’s mean trajectory length is the shortest, FH20’s the longest:

vowel_data_means |>
  distinct(Speaker, Neighborhood, traj_length) |>
  arrange(traj_length)
# A tibble: 29 × 3
   Speaker        Neighborhood          traj_length
   <chr>          <chr>                       <dbl>
 1 HD09           "Hill District"             1369.
 2 FH17           "Forest Hills"              1554.
 3 HD01           "Hill District"             1572.
 4 LV17           "Lawrenceville"             1587.
 5 LV11           "Lawrenceville"             1592.
 6 HD23           "Hill District"             1608.
 7 HD06           "Hill District"             1616.
 8 Interviewer HD "Interviewer"               1631.
 9 FH27           "Forest Hills"              1743.
10 CB10           "Cranberry\nTownship"       1765.
# ℹ 19 more rows

That’s because these are just averages over their onset, midpoint, and offset measurements.

vowel_data_longer |>
  # filter(Speaker %in% c("FH20","HD09","LV04","LV17")) |>
  filter(Speaker %in% c("FH20","HD09")) |>
  ggplot(aes(x = F2, y = F1, color=traj_length)) +
  #making slightly transparent points for each individual vowel
  geom_path(aes(group=MatchId), alpha=0.5, arrow=arrow(length=unit(0.05, "inches"))) +
  facet_wrap(~ Speaker) +
  #flipping the scales, like a good vowel plot should
  scale_x_reverse("F1 (ANAE-normalized Hz)") + 
  scale_y_reverse("F2 (ANAE-normalized Hz)") +
  labs(color="Trajectory\nlength (Hz)") +
  theme_classic()

Session Info

sessionInfo()
R version 4.3.2 (2023-10-31)
Platform: x86_64-apple-darwin20 (64-bit)
Running under: macOS 15.3.1

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/lib/libRblas.0.dylib 
LAPACK: /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

time zone: America/Detroit
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] joeyr_0.9       tidynorm_0.3.0  lubridate_1.9.4 forcats_1.0.0  
 [5] stringr_1.5.1   dplyr_1.1.4     purrr_1.1.0     readr_2.1.5    
 [9] tidyr_1.3.1     tibble_3.3.0    ggplot2_3.5.2   tidyverse_2.0.0

loaded via a namespace (and not attached):
 [1] gtable_0.3.6       jsonlite_2.0.0     compiler_4.3.2     Rcpp_1.1.0        
 [5] tidyselect_1.2.1   jquerylib_0.1.4    scales_1.4.0       yaml_2.3.10       
 [9] fastmap_1.2.0      R6_2.6.1           labeling_0.4.3     generics_0.1.4    
[13] knitr_1.50         MASS_7.3-60.0.1    bslib_0.9.0        pillar_1.11.0     
[17] RColorBrewer_1.1-3 tzdb_0.5.0         rlang_1.1.6        utf8_1.2.6        
[21] cachem_1.1.0       stringi_1.8.7      xfun_0.52          sass_0.4.10       
[25] timechange_0.3.0   cli_3.6.5          withr_3.0.2        magrittr_2.0.3    
[29] digest_0.6.37      grid_4.3.2         rstudioapi_0.17.1  hms_1.1.3         
[33] lifecycle_1.0.4    vctrs_0.6.5        evaluate_1.0.4     glue_1.8.0        
[37] farver_2.1.2       rmarkdown_2.29     tools_4.3.2        pkgconfig_2.0.3   
[41] htmltools_0.5.8.1